{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Pima Indians Diabetes Data Set数据探索\n",
    "数据说明： Pima Indians Diabetes Data Set（皮马印第安人糖尿病数据集） 根据现有的医疗信息预测5年内皮马印第安人糖尿病发作的概率。\n",
    "\n",
    "数据集共9个字段: 0列为pregnants(怀孕次数)； 1列为Plasma_glucose_concentration(口服葡萄糖耐量试验中2小时后的血浆葡萄糖浓度)； 2列为blood_pressure(舒张压,单位:mm Hg)； 3列为Triceps_skin_fold_thickness(三头肌皮褶厚度,单位:mm)； 4列为serum_insulin(餐后2小时血清胰岛素,单位:mu U/ml)； 5列为BMI(体重指数,体重(公斤)/身高(米)^2)； 6列为Diabetes_pedigree_function(糖尿病家系作用)； 7列为Age(年龄)； 8列为Target(分类变量,0或1)\n",
    "\n",
    "数据链接：https://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes\n",
    "\n",
    "p.s.: Kaggle也有一个Practice Fusion Diabetes Classification任务，可以试试:) https://www.kaggle.com/c/pf2012-diabetes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#导入必要的库\n",
    "import numpy  as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6</td>\n",
       "      <td>148</td>\n",
       "      <td>72</td>\n",
       "      <td>35</td>\n",
       "      <td>0</td>\n",
       "      <td>33.6</td>\n",
       "      <td>0.627</td>\n",
       "      <td>50</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>85</td>\n",
       "      <td>66</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "      <td>26.6</td>\n",
       "      <td>0.351</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>183</td>\n",
       "      <td>64</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>23.3</td>\n",
       "      <td>0.672</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>89</td>\n",
       "      <td>66</td>\n",
       "      <td>23</td>\n",
       "      <td>94</td>\n",
       "      <td>28.1</td>\n",
       "      <td>0.167</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>137</td>\n",
       "      <td>40</td>\n",
       "      <td>35</td>\n",
       "      <td>168</td>\n",
       "      <td>43.1</td>\n",
       "      <td>2.288</td>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "0          6                           148              72   \n",
       "1          1                            85              66   \n",
       "2          8                           183              64   \n",
       "3          1                            89              66   \n",
       "4          0                           137              40   \n",
       "\n",
       "   Triceps_skin_fold_thickness  serum_insulin   BMI  \\\n",
       "0                           35              0  33.6   \n",
       "1                           29              0  26.6   \n",
       "2                            0              0  23.3   \n",
       "3                           23             94  28.1   \n",
       "4                           35            168  43.1   \n",
       "\n",
       "   Diabetes_pedigree_function  Age  Target  \n",
       "0                       0.627   50       1  \n",
       "1                       0.351   31       0  \n",
       "2                       0.672   32       1  \n",
       "3                       0.167   21       0  \n",
       "4                       2.288   33       1  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the raw dataset and preview its first rows\n",
    "train = pd.read_csv(\"pima-indians-diabetes.csv\")\n",
    "train.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train : (768, 9)\n"
     ]
    }
   ],
   "source": [
    "# Report the (rows, columns) shape of the loaded frame\n",
    "print(\"train :\", train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 768 entries, 0 to 767\n",
      "Data columns (total 9 columns):\n",
      "pregnants                       768 non-null int64\n",
      "Plasma_glucose_concentration    768 non-null int64\n",
      "blood_pressure                  768 non-null int64\n",
      "Triceps_skin_fold_thickness     768 non-null int64\n",
      "serum_insulin                   768 non-null int64\n",
      "BMI                             768 non-null float64\n",
      "Diabetes_pedigree_function      768 non-null float64\n",
      "Age                             768 non-null int64\n",
      "Target                          768 non-null int64\n",
      "dtypes: float64(2), int64(7)\n",
      "memory usage: 54.1 KB\n"
     ]
    }
   ],
   "source": [
    "# Show column dtypes and non-null counts\n",
    "train.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "      <td>768.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>3.845052</td>\n",
       "      <td>120.894531</td>\n",
       "      <td>69.105469</td>\n",
       "      <td>20.536458</td>\n",
       "      <td>79.799479</td>\n",
       "      <td>31.992578</td>\n",
       "      <td>0.471876</td>\n",
       "      <td>33.240885</td>\n",
       "      <td>0.348958</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>3.369578</td>\n",
       "      <td>31.972618</td>\n",
       "      <td>19.355807</td>\n",
       "      <td>15.952218</td>\n",
       "      <td>115.244002</td>\n",
       "      <td>7.884160</td>\n",
       "      <td>0.331329</td>\n",
       "      <td>11.760232</td>\n",
       "      <td>0.476951</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.078000</td>\n",
       "      <td>21.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>99.000000</td>\n",
       "      <td>62.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>27.300000</td>\n",
       "      <td>0.243750</td>\n",
       "      <td>24.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>3.000000</td>\n",
       "      <td>117.000000</td>\n",
       "      <td>72.000000</td>\n",
       "      <td>23.000000</td>\n",
       "      <td>30.500000</td>\n",
       "      <td>32.000000</td>\n",
       "      <td>0.372500</td>\n",
       "      <td>29.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>6.000000</td>\n",
       "      <td>140.250000</td>\n",
       "      <td>80.000000</td>\n",
       "      <td>32.000000</td>\n",
       "      <td>127.250000</td>\n",
       "      <td>36.600000</td>\n",
       "      <td>0.626250</td>\n",
       "      <td>41.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>17.000000</td>\n",
       "      <td>199.000000</td>\n",
       "      <td>122.000000</td>\n",
       "      <td>99.000000</td>\n",
       "      <td>846.000000</td>\n",
       "      <td>67.100000</td>\n",
       "      <td>2.420000</td>\n",
       "      <td>81.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "count  768.000000                    768.000000      768.000000   \n",
       "mean     3.845052                    120.894531       69.105469   \n",
       "std      3.369578                     31.972618       19.355807   \n",
       "min      0.000000                      0.000000        0.000000   \n",
       "25%      1.000000                     99.000000       62.000000   \n",
       "50%      3.000000                    117.000000       72.000000   \n",
       "75%      6.000000                    140.250000       80.000000   \n",
       "max     17.000000                    199.000000      122.000000   \n",
       "\n",
       "       Triceps_skin_fold_thickness  serum_insulin         BMI  \\\n",
       "count                   768.000000     768.000000  768.000000   \n",
       "mean                     20.536458      79.799479   31.992578   \n",
       "std                      15.952218     115.244002    7.884160   \n",
       "min                       0.000000       0.000000    0.000000   \n",
       "25%                       0.000000       0.000000   27.300000   \n",
       "50%                      23.000000      30.500000   32.000000   \n",
       "75%                      32.000000     127.250000   36.600000   \n",
       "max                      99.000000     846.000000   67.100000   \n",
       "\n",
       "       Diabetes_pedigree_function         Age      Target  \n",
       "count                  768.000000  768.000000  768.000000  \n",
       "mean                     0.471876   33.240885    0.348958  \n",
       "std                      0.331329   11.760232    0.476951  \n",
       "min                      0.078000   21.000000    0.000000  \n",
       "25%                      0.243750   24.000000    0.000000  \n",
       "50%                      0.372500   29.000000    0.000000  \n",
       "75%                      0.626250   41.000000    1.000000  \n",
       "max                      2.420000   81.000000    1.000000  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summary statistics of the numeric columns\n",
    "train.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pregnants                       111\n",
      "Plasma_glucose_concentration      5\n",
      "blood_pressure                   35\n",
      "Triceps_skin_fold_thickness     227\n",
      "serum_insulin                   374\n",
      "BMI                              11\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Count the zero entries in each column where a 0 may stand for a missing value\n",
    "NaN_col_names = [\"pregnants\", \"Plasma_glucose_concentration\", \"blood_pressure\",\n",
    "                 \"Triceps_skin_fold_thickness\", \"serum_insulin\", \"BMI\"]\n",
    "zero_counts = (train[NaN_col_names] == 0).sum()\n",
    "print(zero_counts)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "通过describe可以看到，有些列有很多零值，表示该值无效或缺失，所以我们要对这些数据进行处理：\n",
    "\n",
    "1. 对缺失比较少的列，用中值插补\n",
    "2. 对缺失比较多的列，用虚拟变量法或随机森林法插补"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pregnants                       0\n",
      "Plasma_glucose_concentration    0\n",
      "blood_pressure                  0\n",
      "Triceps_skin_fold_thickness     0\n",
      "serum_insulin                   0\n",
      "BMI                             0\n",
      "Diabetes_pedigree_function      0\n",
      "Age                             0\n",
      "Target                          0\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Impute the columns with few zeros using the column median.\n",
    "# This notebook treats a 0 in these columns as a missing value.\n",
    "\n",
    "# Work on a copy so the raw `train` frame is not mutated and this cell can be re-run safely.\n",
    "train_dummy_variable_initia = train.copy()\n",
    "NaN_col_names = [\"Plasma_glucose_concentration\", \"blood_pressure\", \"BMI\"]\n",
    "# np.nan (lowercase) is used because the np.NaN alias no longer exists in NumPy 2.0.\n",
    "train_dummy_variable_initia[NaN_col_names] = train_dummy_variable_initia[NaN_col_names].replace(0, np.nan)\n",
    "# The median is computed after the zeros were blanked out, so they do not bias it.\n",
    "medians = train_dummy_variable_initia[NaN_col_names].median()\n",
    "train_dummy_variable_initia = train_dummy_variable_initia.fillna(medians)\n",
    "\n",
    "# Sanity check: no column should contain NaN any more.\n",
    "print(train_dummy_variable_initia.isnull().sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "      <th>Target</th>\n",
       "      <th>Triceps_skin_fold_thickness_Missing</th>\n",
       "      <th>pregnants_Missing</th>\n",
       "      <th>serum_insulin_Missing</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6</td>\n",
       "      <td>148.0</td>\n",
       "      <td>72.0</td>\n",
       "      <td>35</td>\n",
       "      <td>0</td>\n",
       "      <td>33.6</td>\n",
       "      <td>0.627</td>\n",
       "      <td>50</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>85.0</td>\n",
       "      <td>66.0</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "      <td>26.6</td>\n",
       "      <td>0.351</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>183.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>23.3</td>\n",
       "      <td>0.672</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>89.0</td>\n",
       "      <td>66.0</td>\n",
       "      <td>23</td>\n",
       "      <td>94</td>\n",
       "      <td>28.1</td>\n",
       "      <td>0.167</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>137.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>35</td>\n",
       "      <td>168</td>\n",
       "      <td>43.1</td>\n",
       "      <td>2.288</td>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "0          6                         148.0            72.0   \n",
       "1          1                          85.0            66.0   \n",
       "2          8                         183.0            64.0   \n",
       "3          1                          89.0            66.0   \n",
       "4          0                         137.0            40.0   \n",
       "\n",
       "   Triceps_skin_fold_thickness  serum_insulin   BMI  \\\n",
       "0                           35              0  33.6   \n",
       "1                           29              0  26.6   \n",
       "2                            0              0  23.3   \n",
       "3                           23             94  28.1   \n",
       "4                           35            168  43.1   \n",
       "\n",
       "   Diabetes_pedigree_function  Age  Target  \\\n",
       "0                       0.627   50       1   \n",
       "1                       0.351   31       0   \n",
       "2                       0.672   32       1   \n",
       "3                       0.167   21       0   \n",
       "4                       2.288   33       1   \n",
       "\n",
       "   Triceps_skin_fold_thickness_Missing  pregnants_Missing  \\\n",
       "0                                    1                  1   \n",
       "1                                    1                  1   \n",
       "2                                    0                  1   \n",
       "3                                    1                  1   \n",
       "4                                    1                  0   \n",
       "\n",
       "   serum_insulin_Missing  \n",
       "0                      0  \n",
       "1                      0  \n",
       "2                      0  \n",
       "3                      1  \n",
       "4                      1  "
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# For the columns with many zeros, add one binary indicator column per feature\n",
    "# instead of imputing a value.\n",
    "# Each `_Missing` flag is 1 where the recorded value is 0 (treated as missing) and 0 otherwise.\n",
    "# NOTE(review): a 0 in `pregnants` may be a genuine value rather than a missing one - confirm.\n",
    "NaN_col_names_most = [\"Triceps_skin_fold_thickness\", \"pregnants\", \"serum_insulin\"]\n",
    "\n",
    "for col in NaN_col_names_most:\n",
    "    train_dummy_variable_initia[col + '_Missing'] = (train_dummy_variable_initia[col] == 0).astype(int)\n",
    "\n",
    "train_dummy_variable_initia.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standardize the feature columns\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Separate the label column from the features\n",
    "y_train = train_dummy_variable_initia['Target']\n",
    "X_train = train_dummy_variable_initia.drop('Target', axis=1)\n",
    "\n",
    "# Remember the column names; they are used to rebuild a DataFrame from the scaled values\n",
    "feat_names = X_train.columns\n",
    "\n",
    "# Fit the scaler on the features and replace them with their scaled values\n",
    "ss_X = StandardScaler()\n",
    "X_train = ss_X.fit_transform(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "      <th>Triceps_skin_fold_thickness_Missing</th>\n",
       "      <th>pregnants_Missing</th>\n",
       "      <th>serum_insulin_Missing</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.639947</td>\n",
       "      <td>0.866045</td>\n",
       "      <td>-0.031990</td>\n",
       "      <td>0.907270</td>\n",
       "      <td>-0.692891</td>\n",
       "      <td>0.166619</td>\n",
       "      <td>0.468492</td>\n",
       "      <td>1.425995</td>\n",
       "      <td>0.647760</td>\n",
       "      <td>0.411035</td>\n",
       "      <td>-1.026390</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-1.205066</td>\n",
       "      <td>-0.528319</td>\n",
       "      <td>0.530902</td>\n",
       "      <td>-0.692891</td>\n",
       "      <td>-0.852200</td>\n",
       "      <td>-0.365061</td>\n",
       "      <td>-0.190672</td>\n",
       "      <td>0.647760</td>\n",
       "      <td>0.411035</td>\n",
       "      <td>-1.026390</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.233880</td>\n",
       "      <td>2.016662</td>\n",
       "      <td>-0.693761</td>\n",
       "      <td>-1.288212</td>\n",
       "      <td>-0.692891</td>\n",
       "      <td>-1.332500</td>\n",
       "      <td>0.604397</td>\n",
       "      <td>-0.105584</td>\n",
       "      <td>-1.543781</td>\n",
       "      <td>0.411035</td>\n",
       "      <td>-1.026390</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-1.073567</td>\n",
       "      <td>-0.528319</td>\n",
       "      <td>0.154533</td>\n",
       "      <td>0.123302</td>\n",
       "      <td>-0.633881</td>\n",
       "      <td>-0.920763</td>\n",
       "      <td>-1.041549</td>\n",
       "      <td>0.647760</td>\n",
       "      <td>0.411035</td>\n",
       "      <td>0.974289</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-1.141852</td>\n",
       "      <td>0.504422</td>\n",
       "      <td>-2.679076</td>\n",
       "      <td>0.907270</td>\n",
       "      <td>0.765836</td>\n",
       "      <td>1.549303</td>\n",
       "      <td>5.484909</td>\n",
       "      <td>-0.020496</td>\n",
       "      <td>0.647760</td>\n",
       "      <td>-2.432883</td>\n",
       "      <td>0.974289</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "0   0.639947                      0.866045       -0.031990   \n",
       "1  -0.844885                     -1.205066       -0.528319   \n",
       "2   1.233880                      2.016662       -0.693761   \n",
       "3  -0.844885                     -1.073567       -0.528319   \n",
       "4  -1.141852                      0.504422       -2.679076   \n",
       "\n",
       "   Triceps_skin_fold_thickness  serum_insulin       BMI  \\\n",
       "0                     0.907270      -0.692891  0.166619   \n",
       "1                     0.530902      -0.692891 -0.852200   \n",
       "2                    -1.288212      -0.692891 -1.332500   \n",
       "3                     0.154533       0.123302 -0.633881   \n",
       "4                     0.907270       0.765836  1.549303   \n",
       "\n",
       "   Diabetes_pedigree_function       Age  Triceps_skin_fold_thickness_Missing  \\\n",
       "0                    0.468492  1.425995                             0.647760   \n",
       "1                   -0.365061 -0.190672                             0.647760   \n",
       "2                    0.604397 -0.105584                            -1.543781   \n",
       "3                   -0.920763 -1.041549                             0.647760   \n",
       "4                    5.484909 -0.020496                             0.647760   \n",
       "\n",
       "   pregnants_Missing  serum_insulin_Missing  Target  \n",
       "0           0.411035              -1.026390       1  \n",
       "1           0.411035              -1.026390       0  \n",
       "2           0.411035              -1.026390       1  \n",
       "3           0.411035               0.974289       0  \n",
       "4          -2.432883               0.974289       1  "
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rebuild a labelled DataFrame from the scaled features and save it as CSV\n",
    "X_train = pd.DataFrame(data=X_train, columns=feat_names)\n",
    "\n",
    "# Put the label column back next to the features\n",
    "train_dummy_variable_initia = pd.concat([X_train, y_train], axis=1)\n",
    "\n",
    "train_dummy_variable_initia.to_csv('0_FE_pima-indians-diabetes.csv', index=False, header=True)\n",
    "\n",
    "train_dummy_variable_initia.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pregnants                       0\n",
      "Plasma_glucose_concentration    0\n",
      "blood_pressure                  0\n",
      "Triceps_skin_fold_thickness     0\n",
      "serum_insulin                   0\n",
      "BMI                             0\n",
      "Diabetes_pedigree_function      0\n",
      "Age                             0\n",
      "Target                          0\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Alternative approach: impute every zero-as-missing column directly with its median\n",
    "# Reload the raw data so this variant starts from an unmodified frame.\n",
    "train = pd.read_csv(\"pima-indians-diabetes.csv\")\n",
    "NaN_col_names = ['Plasma_glucose_concentration', 'blood_pressure', 'Triceps_skin_fold_thickness', 'serum_insulin', 'BMI']\n",
    "# np.nan (lowercase) is used because the np.NaN alias no longer exists in NumPy 2.0.\n",
    "train[NaN_col_names] = train[NaN_col_names].replace(0, np.nan)\n",
    "# median() skips NaN, so the blanked-out zeros do not influence the fill value.\n",
    "medians = train.median()\n",
    "train = train.fillna(medians)\n",
    "# Sanity check: no column should contain NaN any more.\n",
    "print(train.isnull().sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.639947</td>\n",
       "      <td>0.866045</td>\n",
       "      <td>-0.031990</td>\n",
       "      <td>0.670643</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>0.166619</td>\n",
       "      <td>0.468492</td>\n",
       "      <td>1.425995</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-1.205066</td>\n",
       "      <td>-0.528319</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>-0.852200</td>\n",
       "      <td>-0.365061</td>\n",
       "      <td>-0.190672</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.233880</td>\n",
       "      <td>2.016662</td>\n",
       "      <td>-0.693761</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>-1.332500</td>\n",
       "      <td>0.604397</td>\n",
       "      <td>-0.105584</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-1.073567</td>\n",
       "      <td>-0.528319</td>\n",
       "      <td>-0.695245</td>\n",
       "      <td>-0.540642</td>\n",
       "      <td>-0.633881</td>\n",
       "      <td>-0.920763</td>\n",
       "      <td>-1.041549</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-1.141852</td>\n",
       "      <td>0.504422</td>\n",
       "      <td>-2.679076</td>\n",
       "      <td>0.670643</td>\n",
       "      <td>0.316566</td>\n",
       "      <td>1.549303</td>\n",
       "      <td>5.484909</td>\n",
       "      <td>-0.020496</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.342981</td>\n",
       "      <td>-0.185948</td>\n",
       "      <td>0.133453</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>-0.997745</td>\n",
       "      <td>-0.818079</td>\n",
       "      <td>-0.275760</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>-0.250952</td>\n",
       "      <td>-1.435189</td>\n",
       "      <td>-1.851862</td>\n",
       "      <td>0.329171</td>\n",
       "      <td>-0.610145</td>\n",
       "      <td>-0.211799</td>\n",
       "      <td>-0.676133</td>\n",
       "      <td>-0.616111</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1.827813</td>\n",
       "      <td>-0.218823</td>\n",
       "      <td>-0.031990</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>0.414047</td>\n",
       "      <td>-1.020427</td>\n",
       "      <td>-0.360847</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>-0.547919</td>\n",
       "      <td>2.476909</td>\n",
       "      <td>-0.197433</td>\n",
       "      <td>1.808882</td>\n",
       "      <td>4.660524</td>\n",
       "      <td>-0.284572</td>\n",
       "      <td>-0.947944</td>\n",
       "      <td>1.681259</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1.233880</td>\n",
       "      <td>0.109925</td>\n",
       "      <td>1.953325</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>-0.022590</td>\n",
       "      <td>-0.724455</td>\n",
       "      <td>1.766346</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>0.046014</td>\n",
       "      <td>-0.383197</td>\n",
       "      <td>1.622439</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>0.748802</td>\n",
       "      <td>-0.848280</td>\n",
       "      <td>-0.275760</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>1.827813</td>\n",
       "      <td>1.523540</td>\n",
       "      <td>0.133453</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>0.807020</td>\n",
       "      <td>0.196681</td>\n",
       "      <td>0.064591</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>1.827813</td>\n",
       "      <td>0.570172</td>\n",
       "      <td>0.629782</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>-0.779427</td>\n",
       "      <td>2.926869</td>\n",
       "      <td>2.021610</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>2.213910</td>\n",
       "      <td>-1.024647</td>\n",
       "      <td>-0.695245</td>\n",
       "      <td>8.170442</td>\n",
       "      <td>-0.342790</td>\n",
       "      <td>-0.223115</td>\n",
       "      <td>2.191785</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>0.342981</td>\n",
       "      <td>1.457791</td>\n",
       "      <td>-0.031990</td>\n",
       "      <td>-1.150541</td>\n",
       "      <td>0.397653</td>\n",
       "      <td>-0.968636</td>\n",
       "      <td>0.347687</td>\n",
       "      <td>1.511083</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>0.936914</td>\n",
       "      <td>-0.711944</td>\n",
       "      <td>-0.031990</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>-0.357345</td>\n",
       "      <td>0.036615</td>\n",
       "      <td>-0.105584</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>-1.141852</td>\n",
       "      <td>-0.120198</td>\n",
       "      <td>0.960667</td>\n",
       "      <td>2.036530</td>\n",
       "      <td>1.034767</td>\n",
       "      <td>1.942276</td>\n",
       "      <td>0.238963</td>\n",
       "      <td>-0.190672</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>0.936914</td>\n",
       "      <td>-0.481821</td>\n",
       "      <td>0.133453</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>-0.415563</td>\n",
       "      <td>-0.658012</td>\n",
       "      <td>-0.190672</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-0.613320</td>\n",
       "      <td>-3.506291</td>\n",
       "      <td>1.012114</td>\n",
       "      <td>-0.668065</td>\n",
       "      <td>1.578412</td>\n",
       "      <td>-0.872441</td>\n",
       "      <td>-0.020496</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-0.218823</td>\n",
       "      <td>-0.197433</td>\n",
       "      <td>0.101523</td>\n",
       "      <td>-0.517474</td>\n",
       "      <td>0.312165</td>\n",
       "      <td>0.172520</td>\n",
       "      <td>-0.105584</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>-0.250952</td>\n",
       "      <td>0.142800</td>\n",
       "      <td>1.291553</td>\n",
       "      <td>1.353586</td>\n",
       "      <td>1.092686</td>\n",
       "      <td>0.996229</td>\n",
       "      <td>0.701041</td>\n",
       "      <td>-0.531023</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>1.233880</td>\n",
       "      <td>-0.744819</td>\n",
       "      <td>0.960667</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>0.428601</td>\n",
       "      <td>-0.253316</td>\n",
       "      <td>1.425995</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>0.936914</td>\n",
       "      <td>2.444034</td>\n",
       "      <td>1.456996</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>1.069002</td>\n",
       "      <td>-0.063049</td>\n",
       "      <td>0.660206</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>1.530847</td>\n",
       "      <td>-0.087324</td>\n",
       "      <td>0.629782</td>\n",
       "      <td>0.670643</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>-0.502890</td>\n",
       "      <td>-0.630831</td>\n",
       "      <td>-0.360847</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>2.124780</td>\n",
       "      <td>0.701671</td>\n",
       "      <td>1.787882</td>\n",
       "      <td>0.442995</td>\n",
       "      <td>0.061720</td>\n",
       "      <td>0.603256</td>\n",
       "      <td>-0.658012</td>\n",
       "      <td>1.511083</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>1.827813</td>\n",
       "      <td>0.109925</td>\n",
       "      <td>-0.197433</td>\n",
       "      <td>-0.353773</td>\n",
       "      <td>-0.297380</td>\n",
       "      <td>-0.197245</td>\n",
       "      <td>-0.805998</td>\n",
       "      <td>0.660206</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>0.936914</td>\n",
       "      <td>0.833170</td>\n",
       "      <td>0.298896</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>1.010784</td>\n",
       "      <td>-0.648952</td>\n",
       "      <td>0.830381</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-0.810569</td>\n",
       "      <td>-0.528319</td>\n",
       "      <td>-1.605837</td>\n",
       "      <td>-0.007783</td>\n",
       "      <td>-1.347055</td>\n",
       "      <td>0.045675</td>\n",
       "      <td>-0.956462</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>2.718712</td>\n",
       "      <td>0.767420</td>\n",
       "      <td>0.795225</td>\n",
       "      <td>-1.150541</td>\n",
       "      <td>-0.355300</td>\n",
       "      <td>-1.492600</td>\n",
       "      <td>-0.685193</td>\n",
       "      <td>2.021610</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>0.342981</td>\n",
       "      <td>-0.153073</td>\n",
       "      <td>1.622439</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>0.239392</td>\n",
       "      <td>-0.407342</td>\n",
       "      <td>0.404942</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>738</th>\n",
       "      <td>-0.547919</td>\n",
       "      <td>-0.744819</td>\n",
       "      <td>-1.024647</td>\n",
       "      <td>-1.378189</td>\n",
       "      <td>0.223895</td>\n",
       "      <td>0.603256</td>\n",
       "      <td>-0.057009</td>\n",
       "      <td>-1.041549</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>739</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-0.646195</td>\n",
       "      <td>0.133453</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>1.025338</td>\n",
       "      <td>-0.540228</td>\n",
       "      <td>0.745293</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>740</th>\n",
       "      <td>2.124780</td>\n",
       "      <td>-0.054449</td>\n",
       "      <td>0.629782</td>\n",
       "      <td>0.898290</td>\n",
       "      <td>0.108056</td>\n",
       "      <td>1.432866</td>\n",
       "      <td>0.945671</td>\n",
       "      <td>1.255820</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>741</th>\n",
       "      <td>-0.250952</td>\n",
       "      <td>-0.646195</td>\n",
       "      <td>-2.348190</td>\n",
       "      <td>-1.036717</td>\n",
       "      <td>-0.540642</td>\n",
       "      <td>-0.240908</td>\n",
       "      <td>-0.217075</td>\n",
       "      <td>-0.616111</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>742</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-0.416071</td>\n",
       "      <td>-1.190090</td>\n",
       "      <td>-1.264365</td>\n",
       "      <td>-0.285796</td>\n",
       "      <td>-0.575663</td>\n",
       "      <td>-0.763716</td>\n",
       "      <td>-0.956462</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>743</th>\n",
       "      <td>1.530847</td>\n",
       "      <td>0.603047</td>\n",
       "      <td>1.787882</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>0.035628</td>\n",
       "      <td>0.791645</td>\n",
       "      <td>1.000557</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>744</th>\n",
       "      <td>2.718712</td>\n",
       "      <td>1.030419</td>\n",
       "      <td>1.291553</td>\n",
       "      <td>0.898290</td>\n",
       "      <td>-0.007783</td>\n",
       "      <td>1.185439</td>\n",
       "      <td>2.120497</td>\n",
       "      <td>0.490030</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>745</th>\n",
       "      <td>2.421746</td>\n",
       "      <td>-0.711944</td>\n",
       "      <td>0.960667</td>\n",
       "      <td>0.442995</td>\n",
       "      <td>-0.413219</td>\n",
       "      <td>-0.357345</td>\n",
       "      <td>0.048695</td>\n",
       "      <td>1.085644</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>746</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>0.833170</td>\n",
       "      <td>1.787882</td>\n",
       "      <td>1.353586</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>2.451685</td>\n",
       "      <td>-0.343920</td>\n",
       "      <td>-0.531023</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>747</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-1.336565</td>\n",
       "      <td>0.133453</td>\n",
       "      <td>1.353586</td>\n",
       "      <td>-0.969246</td>\n",
       "      <td>2.015048</td>\n",
       "      <td>1.884928</td>\n",
       "      <td>-0.105584</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>748</th>\n",
       "      <td>-0.250952</td>\n",
       "      <td>2.148161</td>\n",
       "      <td>-0.197433</td>\n",
       "      <td>-0.809069</td>\n",
       "      <td>0.687250</td>\n",
       "      <td>0.574147</td>\n",
       "      <td>-0.192914</td>\n",
       "      <td>0.234767</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>749</th>\n",
       "      <td>0.639947</td>\n",
       "      <td>1.326292</td>\n",
       "      <td>-0.859204</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>-1.186955</td>\n",
       "      <td>-0.887541</td>\n",
       "      <td>1.425995</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>750</th>\n",
       "      <td>0.046014</td>\n",
       "      <td>0.471547</td>\n",
       "      <td>-0.197433</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>-0.182690</td>\n",
       "      <td>2.144658</td>\n",
       "      <td>-0.956462</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>751</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-0.021574</td>\n",
       "      <td>0.464339</td>\n",
       "      <td>1.125938</td>\n",
       "      <td>-0.772320</td>\n",
       "      <td>0.952566</td>\n",
       "      <td>-0.636871</td>\n",
       "      <td>-0.445935</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>752</th>\n",
       "      <td>-0.250952</td>\n",
       "      <td>-0.448946</td>\n",
       "      <td>-0.859204</td>\n",
       "      <td>-0.581421</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>-0.939527</td>\n",
       "      <td>-0.751636</td>\n",
       "      <td>-0.701198</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>753</th>\n",
       "      <td>-1.141852</td>\n",
       "      <td>1.950912</td>\n",
       "      <td>1.291553</td>\n",
       "      <td>1.695058</td>\n",
       "      <td>4.278256</td>\n",
       "      <td>1.578412</td>\n",
       "      <td>-0.754656</td>\n",
       "      <td>-0.616111</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>754</th>\n",
       "      <td>1.233880</td>\n",
       "      <td>1.063293</td>\n",
       "      <td>0.464339</td>\n",
       "      <td>0.329171</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>-0.008035</td>\n",
       "      <td>-0.087210</td>\n",
       "      <td>1.000557</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>755</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>0.208549</td>\n",
       "      <td>1.291553</td>\n",
       "      <td>1.125938</td>\n",
       "      <td>-0.355300</td>\n",
       "      <td>0.588702</td>\n",
       "      <td>1.767143</td>\n",
       "      <td>0.319855</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>756</th>\n",
       "      <td>0.936914</td>\n",
       "      <td>0.504422</td>\n",
       "      <td>1.456996</td>\n",
       "      <td>1.353586</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>-0.066254</td>\n",
       "      <td>-0.244256</td>\n",
       "      <td>0.490030</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>757</th>\n",
       "      <td>-1.141852</td>\n",
       "      <td>0.044175</td>\n",
       "      <td>-0.031990</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>0.559592</td>\n",
       "      <td>-0.645932</td>\n",
       "      <td>1.596171</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>758</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-0.514696</td>\n",
       "      <td>0.298896</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>0.734247</td>\n",
       "      <td>-0.830159</td>\n",
       "      <td>-0.616111</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>759</th>\n",
       "      <td>0.639947</td>\n",
       "      <td>2.246785</td>\n",
       "      <td>1.622439</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>0.443156</td>\n",
       "      <td>-0.585529</td>\n",
       "      <td>2.787399</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>760</th>\n",
       "      <td>-0.547919</td>\n",
       "      <td>-1.106442</td>\n",
       "      <td>-1.190090</td>\n",
       "      <td>-0.353773</td>\n",
       "      <td>-1.444185</td>\n",
       "      <td>-0.590218</td>\n",
       "      <td>0.888288</td>\n",
       "      <td>-0.956462</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>761</th>\n",
       "      <td>1.530847</td>\n",
       "      <td>1.589290</td>\n",
       "      <td>0.133453</td>\n",
       "      <td>0.215347</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>1.680294</td>\n",
       "      <td>-0.208015</td>\n",
       "      <td>0.830381</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>762</th>\n",
       "      <td>1.530847</td>\n",
       "      <td>-1.073567</td>\n",
       "      <td>-0.859204</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>-1.448937</td>\n",
       "      <td>-0.996266</td>\n",
       "      <td>-0.020496</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>763</th>\n",
       "      <td>1.827813</td>\n",
       "      <td>-0.679069</td>\n",
       "      <td>0.298896</td>\n",
       "      <td>2.150354</td>\n",
       "      <td>0.455573</td>\n",
       "      <td>0.064737</td>\n",
       "      <td>-0.908682</td>\n",
       "      <td>2.532136</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>764</th>\n",
       "      <td>-0.547919</td>\n",
       "      <td>0.011301</td>\n",
       "      <td>-0.197433</td>\n",
       "      <td>-0.239949</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>0.632365</td>\n",
       "      <td>-0.398282</td>\n",
       "      <td>-0.531023</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>765</th>\n",
       "      <td>0.342981</td>\n",
       "      <td>-0.021574</td>\n",
       "      <td>-0.031990</td>\n",
       "      <td>-0.695245</td>\n",
       "      <td>-0.332132</td>\n",
       "      <td>-0.910418</td>\n",
       "      <td>-0.685193</td>\n",
       "      <td>-0.275760</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>766</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>0.142800</td>\n",
       "      <td>-1.024647</td>\n",
       "      <td>-0.012301</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>-0.342790</td>\n",
       "      <td>-0.371101</td>\n",
       "      <td>1.170732</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>767</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-0.942068</td>\n",
       "      <td>-0.197433</td>\n",
       "      <td>0.215347</td>\n",
       "      <td>-0.181541</td>\n",
       "      <td>-0.299127</td>\n",
       "      <td>-0.473785</td>\n",
       "      <td>-0.871374</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>768 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "0     0.639947                      0.866045       -0.031990   \n",
       "1    -0.844885                     -1.205066       -0.528319   \n",
       "2     1.233880                      2.016662       -0.693761   \n",
       "3    -0.844885                     -1.073567       -0.528319   \n",
       "4    -1.141852                      0.504422       -2.679076   \n",
       "5     0.342981                     -0.185948        0.133453   \n",
       "6    -0.250952                     -1.435189       -1.851862   \n",
       "7     1.827813                     -0.218823       -0.031990   \n",
       "8    -0.547919                      2.476909       -0.197433   \n",
       "9     1.233880                      0.109925        1.953325   \n",
       "10    0.046014                     -0.383197        1.622439   \n",
       "11    1.827813                      1.523540        0.133453   \n",
       "12    1.827813                      0.570172        0.629782   \n",
       "13   -0.844885                      2.213910       -1.024647   \n",
       "14    0.342981                      1.457791       -0.031990   \n",
       "15    0.936914                     -0.711944       -0.031990   \n",
       "16   -1.141852                     -0.120198        0.960667   \n",
       "17    0.936914                     -0.481821        0.133453   \n",
       "18   -0.844885                     -0.613320       -3.506291   \n",
       "19   -0.844885                     -0.218823       -0.197433   \n",
       "20   -0.250952                      0.142800        1.291553   \n",
       "21    1.233880                     -0.744819        0.960667   \n",
       "22    0.936914                      2.444034        1.456996   \n",
       "23    1.530847                     -0.087324        0.629782   \n",
       "24    2.124780                      0.701671        1.787882   \n",
       "25    1.827813                      0.109925       -0.197433   \n",
       "26    0.936914                      0.833170        0.298896   \n",
       "27   -0.844885                     -0.810569       -0.528319   \n",
       "28    2.718712                      0.767420        0.795225   \n",
       "29    0.342981                     -0.153073        1.622439   \n",
       "..         ...                           ...             ...   \n",
       "738  -0.547919                     -0.744819       -1.024647   \n",
       "739  -0.844885                     -0.646195        0.133453   \n",
       "740   2.124780                     -0.054449        0.629782   \n",
       "741  -0.250952                     -0.646195       -2.348190   \n",
       "742  -0.844885                     -0.416071       -1.190090   \n",
       "743   1.530847                      0.603047        1.787882   \n",
       "744   2.718712                      1.030419        1.291553   \n",
       "745   2.421746                     -0.711944        0.960667   \n",
       "746  -0.844885                      0.833170        1.787882   \n",
       "747  -0.844885                     -1.336565        0.133453   \n",
       "748  -0.250952                      2.148161       -0.197433   \n",
       "749   0.639947                      1.326292       -0.859204   \n",
       "750   0.046014                      0.471547       -0.197433   \n",
       "751  -0.844885                     -0.021574        0.464339   \n",
       "752  -0.250952                     -0.448946       -0.859204   \n",
       "753  -1.141852                      1.950912        1.291553   \n",
       "754   1.233880                      1.063293        0.464339   \n",
       "755  -0.844885                      0.208549        1.291553   \n",
       "756   0.936914                      0.504422        1.456996   \n",
       "757  -1.141852                      0.044175       -0.031990   \n",
       "758  -0.844885                     -0.514696        0.298896   \n",
       "759   0.639947                      2.246785        1.622439   \n",
       "760  -0.547919                     -1.106442       -1.190090   \n",
       "761   1.530847                      1.589290        0.133453   \n",
       "762   1.530847                     -1.073567       -0.859204   \n",
       "763   1.827813                     -0.679069        0.298896   \n",
       "764  -0.547919                      0.011301       -0.197433   \n",
       "765   0.342981                     -0.021574       -0.031990   \n",
       "766  -0.844885                      0.142800       -1.024647   \n",
       "767  -0.844885                     -0.942068       -0.197433   \n",
       "\n",
       "     Triceps_skin_fold_thickness  serum_insulin       BMI  \\\n",
       "0                       0.670643      -0.181541  0.166619   \n",
       "1                      -0.012301      -0.181541 -0.852200   \n",
       "2                      -0.012301      -0.181541 -1.332500   \n",
       "3                      -0.695245      -0.540642 -0.633881   \n",
       "4                       0.670643       0.316566  1.549303   \n",
       "5                      -0.012301      -0.181541 -0.997745   \n",
       "6                       0.329171      -0.610145 -0.211799   \n",
       "7                      -0.012301      -0.181541  0.414047   \n",
       "8                       1.808882       4.660524 -0.284572   \n",
       "9                      -0.012301      -0.181541 -0.022590   \n",
       "10                     -0.012301      -0.181541  0.748802   \n",
       "11                     -0.012301      -0.181541  0.807020   \n",
       "12                     -0.012301      -0.181541 -0.779427   \n",
       "13                     -0.695245       8.170442 -0.342790   \n",
       "14                     -1.150541       0.397653 -0.968636   \n",
       "15                     -0.012301      -0.181541 -0.357345   \n",
       "16                      2.036530       1.034767  1.942276   \n",
       "17                     -0.012301      -0.181541 -0.415563   \n",
       "18                      1.012114      -0.668065  1.578412   \n",
       "19                      0.101523      -0.517474  0.312165   \n",
       "20                      1.353586       1.092686  0.996229   \n",
       "21                     -0.012301      -0.181541  0.428601   \n",
       "22                     -0.012301      -0.181541  1.069002   \n",
       "23                      0.670643      -0.181541 -0.502890   \n",
       "24                      0.442995       0.061720  0.603256   \n",
       "25                     -0.353773      -0.297380 -0.197245   \n",
       "26                     -0.012301      -0.181541  1.010784   \n",
       "27                     -1.605837      -0.007783 -1.347055   \n",
       "28                     -1.150541      -0.355300 -1.492600   \n",
       "29                     -0.012301      -0.181541  0.239392   \n",
       "..                           ...            ...       ...   \n",
       "738                    -1.378189       0.223895  0.603256   \n",
       "739                    -0.012301      -0.181541  1.025338   \n",
       "740                     0.898290       0.108056  1.432866   \n",
       "741                    -1.036717      -0.540642 -0.240908   \n",
       "742                    -1.264365      -0.285796 -0.575663   \n",
       "743                    -0.012301      -0.181541  0.035628   \n",
       "744                     0.898290      -0.007783  1.185439   \n",
       "745                     0.442995      -0.413219 -0.357345   \n",
       "746                     1.353586      -0.181541  2.451685   \n",
       "747                     1.353586      -0.969246  2.015048   \n",
       "748                    -0.809069       0.687250  0.574147   \n",
       "749                    -0.012301      -0.181541 -1.186955   \n",
       "750                    -0.012301      -0.181541 -0.182690   \n",
       "751                     1.125938      -0.772320  0.952566   \n",
       "752                    -0.581421      -0.181541 -0.939527   \n",
       "753                     1.695058       4.278256  1.578412   \n",
       "754                     0.329171      -0.181541 -0.008035   \n",
       "755                     1.125938      -0.355300  0.588702   \n",
       "756                     1.353586      -0.181541 -0.066254   \n",
       "757                    -0.012301      -0.181541  0.559592   \n",
       "758                    -0.012301      -0.181541  0.734247   \n",
       "759                    -0.012301      -0.181541  0.443156   \n",
       "760                    -0.353773      -1.444185 -0.590218   \n",
       "761                     0.215347      -0.181541  1.680294   \n",
       "762                    -0.012301      -0.181541 -1.448937   \n",
       "763                     2.150354       0.455573  0.064737   \n",
       "764                    -0.239949      -0.181541  0.632365   \n",
       "765                    -0.695245      -0.332132 -0.910418   \n",
       "766                    -0.012301      -0.181541 -0.342790   \n",
       "767                     0.215347      -0.181541 -0.299127   \n",
       "\n",
       "     Diabetes_pedigree_function       Age  Target  \n",
       "0                      0.468492  1.425995       1  \n",
       "1                     -0.365061 -0.190672       0  \n",
       "2                      0.604397 -0.105584       1  \n",
       "3                     -0.920763 -1.041549       0  \n",
       "4                      5.484909 -0.020496       1  \n",
       "5                     -0.818079 -0.275760       0  \n",
       "6                     -0.676133 -0.616111       1  \n",
       "7                     -1.020427 -0.360847       0  \n",
       "8                     -0.947944  1.681259       1  \n",
       "9                     -0.724455  1.766346       1  \n",
       "10                    -0.848280 -0.275760       0  \n",
       "11                     0.196681  0.064591       1  \n",
       "12                     2.926869  2.021610       0  \n",
       "13                    -0.223115  2.191785       1  \n",
       "14                     0.347687  1.511083       1  \n",
       "15                     0.036615 -0.105584       1  \n",
       "16                     0.238963 -0.190672       1  \n",
       "17                    -0.658012 -0.190672       1  \n",
       "18                    -0.872441 -0.020496       0  \n",
       "19                     0.172520 -0.105584       1  \n",
       "20                     0.701041 -0.531023       0  \n",
       "21                    -0.253316  1.425995       0  \n",
       "22                    -0.063049  0.660206       1  \n",
       "23                    -0.630831 -0.360847       1  \n",
       "24                    -0.658012  1.511083       1  \n",
       "25                    -0.805998  0.660206       1  \n",
       "26                    -0.648952  0.830381       1  \n",
       "27                     0.045675 -0.956462       0  \n",
       "28                    -0.685193  2.021610       0  \n",
       "29                    -0.407342  0.404942       0  \n",
       "..                          ...       ...     ...  \n",
       "738                   -0.057009 -1.041549       0  \n",
       "739                   -0.540228  0.745293       1  \n",
       "740                    0.945671  1.255820       1  \n",
       "741                   -0.217075 -0.616111       0  \n",
       "742                   -0.763716 -0.956462       0  \n",
       "743                    0.791645  1.000557       1  \n",
       "744                    2.120497  0.490030       0  \n",
       "745                    0.048695  1.085644       0  \n",
       "746                   -0.343920 -0.531023       1  \n",
       "747                    1.884928 -0.105584       0  \n",
       "748                   -0.192914  0.234767       1  \n",
       "749                   -0.887541  1.425995       1  \n",
       "750                    2.144658 -0.956462       1  \n",
       "751                   -0.636871 -0.445935       0  \n",
       "752                   -0.751636 -0.701198       0  \n",
       "753                   -0.754656 -0.616111       1  \n",
       "754                   -0.087210  1.000557       1  \n",
       "755                    1.767143  0.319855       1  \n",
       "756                   -0.244256  0.490030       0  \n",
       "757                   -0.645932  1.596171       1  \n",
       "758                   -0.830159 -0.616111       0  \n",
       "759                   -0.585529  2.787399       1  \n",
       "760                    0.888288 -0.956462       0  \n",
       "761                   -0.208015  0.830381       1  \n",
       "762                   -0.996266 -0.020496       0  \n",
       "763                   -0.908682  2.532136       0  \n",
       "764                   -0.398282 -0.531023       0  \n",
       "765                   -0.685193 -0.275760       0  \n",
       "766                   -0.371101  1.170732       1  \n",
       "767                   -0.473785 -0.871374       0  \n",
       "\n",
       "[768 rows x 9 columns]"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_train = train['Target']\n",
    "X_train = train.drop(['Target'], axis= 1)\n",
    "#用于保存特征工程之后的结果\n",
    "feat_names = X_train.columns\n",
    "# 初始化特征的标准化器\n",
    "ss_X = StandardScaler()\n",
    "\n",
    "X_train = ss_X.fit_transform(X_train)\n",
    "\n",
    "#存为csv格式\n",
    "X_train = pd.DataFrame(columns = feat_names, data = X_train)\n",
    "\n",
    "train = pd.concat([X_train, y_train], axis = 1)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
